library(data.table)
library(dplyr)
library(ggplot2)
library(grid)
library(plotly)
library(tibble)
library(stringr)
# Load the combined red/white wine data set from the CSV file.
wine_dset <- read.csv2(file = 'BaseWine_Red_e_White2018.csv')
# Show every column with its type and first values.
glimpse(wine_dset)
## Observations: 6,497
## Variables: 14
## $ id_vinho <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...
## $ fixedacidity <dbl> 6.6, 6.7, 10.6, 5.4, 6.7, 6.8, 6.6, 7.2, 5....
## $ volatileacidity <dbl> 0.240, 0.340, 0.310, 0.180, 0.300, 0.500, 0...
## $ citricacid <dbl> 0.35, 0.43, 0.49, 0.24, 0.44, 0.11, 0.00, 0...
## $ residualsugar <dbl> 7.70, 1.60, 2.20, 4.80, 18.75, 1.50, 1.60, ...
## $ chlorides <dbl> 0.031, 0.041, 0.063, 0.041, 0.057, 0.075, 0...
## $ freesulfurdioxide <dbl> 36, 29, 18, 30, 65, 16, 4, 34, 46, 58, 54, ...
## $ totalsulfurdioxide <dbl> 135, 114, 40, 113, 224, 49, 8, 102, 113, 18...
## $ density <dbl> 0.99380, 0.99014, 0.99760, 0.99445, 0.99956...
## $ pH <dbl> 3.19, 3.23, 3.14, 3.42, 3.11, 3.36, 3.33, 3...
## $ sulphates <dbl> 0.37, 0.44, 0.51, 0.40, 0.53, 0.79, 0.37, 0...
## $ alcohol <dbl> 10.50, 12.60, 9.80, 9.40, 9.10, 9.50, 10.40...
## $ quality <int> 5, 6, 6, 6, 5, 5, 4, 6, 7, 6, 5, 6, 6, 6, 6...
## $ Vinho <fct> WHITE, WHITE, RED, WHITE, WHITE, RED, RED, ...
# Number of rows per wine type; the result stays grouped by Vinho.
count(group_by(wine_dset, Vinho))
## # A tibble: 2 x 2
## # Groups: Vinho [2]
## Vinho n
## <fct> <int>
## 1 RED 1599
## 2 WHITE 4898
# Split the data into one data frame per wine type (column Vinho).
wine_white_dset <- filter(wine_dset, Vinho == 'WHITE')
wine_red_dset <- filter(wine_dset, Vinho == 'RED')
# Count the missing values in each subset.
wine_white_dset %>% is.na() %>% sum()
## [1] 0
wine_red_dset %>% is.na() %>% sum()
## [1] 0
Os vinhos branco e tinto apresentam características diferentes que definem se cada um é bom ou ruim. Vamos dar uma olhada nos dados de cada característica:
Vinho Branco
# Descriptive statistics for every column of the white-wine subset.
wine_white_dset %>% summary()
## id_vinho fixedacidity volatileacidity citricacid
## Min. : 1 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:1650 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700
## Median :3310 Median : 6.800 Median :0.2600 Median :0.3200
## Mean :3284 Mean : 6.855 Mean :0.2782 Mean :0.3342
## 3rd Qu.:4932 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900
## Max. :6497 Max. :14.200 Max. :1.1000 Max. :1.6600
## residualsugar chlorides freesulfurdioxide totalsulfurdioxide
## Min. : 0.600 Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.: 1.700 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median : 5.200 Median :0.04300 Median : 34.00 Median :134.0
## Mean : 6.387 Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.: 9.900 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :45.800 Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0140 Max. :3.820 Max. :1.0800 Max. :14.20
## quality Vinho
## Min. :3.000 RED : 0
## 1st Qu.:5.000 WHITE:4898
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
Vinho Tinto
# Descriptive statistics for every column of the red-wine subset.
wine_red_dset %>% summary()
## id_vinho fixedacidity volatileacidity citricacid
## Min. : 3 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.:1523 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median :3103 Median : 7.90 Median :0.5200 Median :0.260
## Mean :3141 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:4690 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :6490 Max. :15.90 Max. :1.5800 Max. :1.000
## residualsugar chlorides freesulfurdioxide totalsulfurdioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median : 2.200 Median :0.07900 Median :14.00 Median : 38.00
## Mean : 2.539 Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :15.500 Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 0.9567
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.5000
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.2000
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.4001
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.1000
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.9000
## quality Vinho
## Min. :3.000 RED :1599
## 1st Qu.:5.000 WHITE: 0
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
Medianas - Tinto dataset
## fixedacidity
# wine_dset %>%
# ggplot(aes(fixedacidity, fill = Vinho)) +
# geom_boxplot()
# wine_Cmedian
# ou
# > sapply(teste_matrix, median)
# ou
# > sort(teste_matrix$ColunaB)
# Per-column medians of the red-wine measurements; the Vinho, id_vinho and
# quality columns are left out.
# vapply() replaces sapply() so the result is guaranteed to be a named numeric
# vector (one value per column) instead of depending on the input's shape.
median_red_dset <- vapply(
  select(wine_red_dset, -c(Vinho, id_vinho, quality)),
  median,
  numeric(1)
)
# One-column data frame; the column is named after the variable.
median_red_dset <- as.data.frame(median_red_dset)
# Print a copy with a readable column name; median_red_dset itself is not
# modified by this line.
setNames(median_red_dset, "Median")
## Median
## fixedacidity 7.90000
## volatileacidity 0.52000
## citricacid 0.26000
## residualsugar 2.20000
## chlorides 0.07900
## freesulfurdioxide 14.00000
## totalsulfurdioxide 38.00000
## density 0.99675
## pH 3.31000
## sulphates 0.62000
## alcohol 10.20000
Medianas - Branco dataset
# Per-column medians of the white-wine measurements; the Vinho, id_vinho and
# quality columns are left out.
# vapply() replaces sapply() so the result is guaranteed to be a named numeric
# vector (one value per column) instead of depending on the input's shape.
median_white_dset <- vapply(
  select(wine_white_dset, -c(Vinho, id_vinho, quality)),
  median,
  numeric(1)
)
# One-column data frame; the column is named after the variable.
median_white_dset <- as.data.frame(median_white_dset)
# Print a copy with a readable column name; median_white_dset itself is not
# modified by this line.
setNames(median_white_dset, "Median")
## Median
## fixedacidity 6.80000
## volatileacidity 0.26000
## citricacid 0.32000
## residualsugar 5.20000
## chlorides 0.04300
## freesulfurdioxide 34.00000
## totalsulfurdioxide 134.00000
## density 0.99374
## pH 3.18000
## sulphates 0.47000
## alcohol 10.40000
Diferença das medianas entre os dois tipos de vinhos
# Absolute difference between the white and red medians, one row per
# characteristic, with the characteristic name moved from the row names into
# a regular column called "rowname".
Cmedian_differenc <- (median_white_dset - median_red_dset) %>%
  abs() %>%
  `colnames<-`("Mediana_Difer") %>%
  rownames_to_column()
# Show the characteristics ordered from the largest to the smallest difference.
rename(
  arrange(Cmedian_differenc, desc(Mediana_Difer)),
  Caracteristica = rowname
)
## Caracteristica Mediana_Difer
## 1 totalsulfurdioxide 96.00000
## 2 freesulfurdioxide 20.00000
## 3 residualsugar 3.00000
## 4 fixedacidity 1.10000
## 5 volatileacidity 0.26000
## 6 alcohol 0.20000
## 7 sulphates 0.15000
## 8 pH 0.13000
## 9 citricacid 0.06000
## 10 chlorides 0.03600
## 11 density 0.00301
Nota: a ordem descendente dessas características será utilizada nos plots para uma melhor visualização:
Multiplot function
# Draw several plots on a single grid page.
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot
# objects); the plots from plotlist are appended after those given in ...
# - plotlist: optional list of plots
# - file: kept for backward compatibility; the function never reads it
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# When no plot is supplied nothing is drawn and NULL is returned invisibly.
# A single plot is printed directly, without setting up a grid layout.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: leave before touching the graphics device. Without this
  # guard the loop below would run over c(1, 0) and index a missing plot.
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Number of rows needed, calculated from the number of columns
    n_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # Make each plot, in the correct location
    for (i in seq_len(num_plots)) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
# Box plots comparing the two wine types (column Vinho) for totalsulfurdioxide,
# freesulfurdioxide, residualsugar and fixedacidity.
# Each title names the variable that is actually drawn on the y axis.
plot_ly(wine_dset, y = ~totalsulfurdioxide, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Total sulfurdioxide")
plot_ly(wine_dset, y = ~freesulfurdioxide, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Free sulfurdioxide")
plot_ly(wine_dset, y = ~residualsugar, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Residual sugar")
plot_ly(wine_dset, y = ~fixedacidity, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Fixed acidity")
Observando a mediana de cada característica, vemos uma diferença considerável entre os tipos de vinho (branco ou tinto); portanto, vamos analisar somente um tipo de vinho para manter a análise coerente.
Como o número de amostras de vinhos brancos é bem maior do que o de vinhos tintos (aproximadamente 3 vezes maior: 4898 contra 1599), é mais interessante utilizar os dados de vinho branco, que oferecem mais amostras para treinar e validar nosso modelo.